Compute tf-idf matrices from the co-occurrence matrices of the train and test sets, then use them to compute the cosine similarity (linear kernel) between entries of the train and test sets to make predictions. Finally, replace the cosine-similarity predictions for duplicate questions with the tags from the train set and save the result in the submission file.
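
A minimal sketch of that pipeline on toy count matrices (the data and names below are illustrative, not taken from the notebook): tf-idf weighting of the word counts, followed by the linear kernel between test and train rows, which equals cosine similarity for l2-normalized vectors. Note that this sketch fits the transformer on the train counts and only applies it to the test counts, whereas the cells below refit it on each test file.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

# toy word-count matrices: rows are train entries and test posts, columns are words
train_counts = csr_matrix(np.array([[2, 0, 1],
                                    [0, 3, 1]]))
test_counts = csr_matrix(np.array([[1, 0, 2]]))

tfidf = TfidfTransformer(norm="l2")
train_tfidf = tfidf.fit_transform(train_counts)
test_tfidf = tfidf.transform(test_counts)

# one similarity score per train row for each test row; the highest-scoring
# train rows supply the candidate tags for that test post
similarities = linear_kernel(test_tfidf, train_tfidf)
ranking = similarities.argsort()[:, ::-1]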


In [1]:
import pandas as pd
import numpy as np
import string
import time
from scipy.sparse import *
from scipy.io import mmwrite, mmread
import csv
from bs4 import BeautifulSoup
from nltk.tag import brill
from taggerfunctions import *
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from ast import literal_eval
import sklearn as sk
import gc
import os
import psutil

In [2]:
def getRealDict(fname):
    """Load a two-column CSV into a plain dict; keys are parsed with literal_eval, values stay strings."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {literal_eval(rows[0]): rows[1] for rows in reader}
    return dictWords

def getDict(fname):
    """Load a two-column CSV into a pandas Series; keys stay strings, values are parsed with literal_eval."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
    return pd.Series(dictWords)

def getInvDict(fname):
    """Load an inverse dictionary (integer index -> string) from a two-column CSV into a pandas Series."""
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        invDictWords = {literal_eval(rows[0]): rows[1] for rows in reader}
    return pd.Series(invDictWords)
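
These loaders assume two-column CSV files of the kind written by pandas' Series.to_csv (index in the first column, value in the second, no header). A hypothetical invdictKeys.csv mapping tag index to tag name might look like:

0,python
1,c++
2,javascript

getInvDict parses the integer index with literal_eval and returns a Series mapping index -> string; getDict does the reverse, parsing the value column instead.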

In [3]:
def save_results(predictions, filename):
    """Given a vector of predictions, save results in CSV format."""
    with open(filename, 'w') as f:
        f.write("Id,Tags\n")
        for i, pred in predictions.iteritems():
            f.write(str(i) + ",\"" + pred + "\"" + "\n")
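
For illustration, calling it on a small Series (the values here are made up) produces the two-column submission format:

preds = pd.Series({1: "python pandas", 2: "c++ templates"})
save_results(preds, "example_submission.csv")
# example_submission.csv then contains:
# Id,Tags
# 1,"python pandas"
# 2,"c++ templates"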
compute tf-idf matrix for train set:

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer

In [6]:
coocMatTitle_coo = mmread("coocMatTitleNew_coo.mtx")
coocMatTitle_csr = coocMatTitle_coo.tocsr()

In [4]:
coocMatBodyFull_csr = mmread("coocMatBodyFull2_csr.mtx")
coocMatBodyFull_csr = coocMatBodyFull_csr.tocsr()


Out[4]:
<42048x66586 sparse matrix of type '<type 'numpy.float64'>'
	with 56489281 stored elements in COOrdinate format>

In [9]:
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix_title = tfidf.fit_transform(coocMatTitle_csr)
tf_idf_matrix_body = tfidf.fit_transform(coocMatBodyFull_csr)

In [10]:
mmwrite("tfidfMatTitle.mtx",tf_idf_matrix_title)
mmwrite("tfidfMatBody.mtx",tf_idf_matrix_body)
compute tf-idf matrix for test set:

In [11]:
testWTitle = mmread("testWordsQTitle_0-200000.mtx")

In [12]:
testWTitle


Out[12]:
<200000x45469 sparse matrix of type '<type 'numpy.float64'>'
	with 1020998 stored elements in COOrdinate format>

In [13]:
tf_idf_matTestTitle = tfidf.fit_transform(testWTitle)

In [15]:
tf_idf_matTestTitle


Out[15]:
<200000x45469 sparse matrix of type '<type 'numpy.float64'>'
	with 1020998 stored elements in Compressed Sparse Row format>

make predictions:


In [4]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(norm="l2")

In [5]:
tf_idf_title = mmread("tfidfMatTitle.mtx")
tf_idf_title = tf_idf_title.tocsr()

In [6]:
tf_idf_body = mmread("tfidfMatBody.mtx")
tf_idf_body = tf_idf_body.tocsr()
save 20 most likely tags for each post:

In [ ]:
frange = ["0-200000","200000-400000","400000-600000","600000-800000","800000-1000000","1000000-1200000","1200000-1400000",
          "1400000-1600000","1600000-1800000","1800000-2000000","2000000-2013337"]
invDictKeys = getInvDict("invdictKeys.csv")
invDictWords = getInvDict("invdictWordsNew.csv")
result_tags = {}
countRows = 0
""" choose a number for chunk_size that is a divisor of the number of rows in testWordsQ; 
    otherwise the iterator will skip the last entries!"""
chunk_size = 1000
tStart = time.time()
for fran in frange:
    fname = "invdictIdTest_" + fran + ".csv"
    """ make sure that invDictIdTest is sorted!!! use getInvDict or sort dict after using getDict """
    invDictIdTest = getInvDict(fname)
    fname = "testWordsQTitle_" + fran + ".mtx"
    testQTitle = mmread(fname)
    testQTitle = testQTitle.tocsr()
    tf_idf_matTestTitle = tfidf.fit_transform(testQTitle)
    #fname = "testWordsQBody_" + fran + ".mtx"
    #testQBody = mmread(fname)
    #testQBody = testQBody.tocsr()
    #tf_idf_matTestBody = tfidf.fit_transform(testQBody)
    for idx_chunk in xrange(len(invDictIdTest)/chunk_size):
        cs_title = linear_kernel(tf_idf_matTestTitle[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_title)
        #cs_body = linear_kernel(tf_idf_matTestBody[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_body)
        for row_idx, row in enumerate(cs_title):
            # indices of the 20 highest-scoring tags, in descending order of similarity
            rel_tags = row.argsort()[:-21:-1]
            ar = []
            words = []
            # words occurring in this test post's title (row_idx is local to the chunk,
            # so the global row index within the current file is needed here)
            for idx in testQTitle[chunk_size*idx_chunk + row_idx, :].nonzero()[1]:
                words.append(invDictWords[idx])
            # keep candidate tags that literally appear among the title words
            for idx in rel_tags:
                if invDictKeys[idx] in words:
                    ar.append(invDictKeys[idx])
            # always include the two top-ranked tags, even if they are not in the title
            for idx in rel_tags[0:2]:
                if invDictKeys[idx] not in ar:
                    ar.append(invDictKeys[idx])
            result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
            countRows += 1
        del cs_title
        gc.collect()
        if countRows % 20000 == 0:
            print("{0:d} questions finished in {1:.0f}s".format(countRows, time.time()-tStart))
            tStart = time.time()
""" the linear_kernel below is necessary because matrix size 13337 is not divisible by chunk_size;
    it will skip the last part in the for loop above and i have to do it manually below """
cs_title = linear_kernel(tf_idf_matTestTitle[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_title)
#cs_body = linear_kernel(tf_idf_matTestBody[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_body)
for row_idx,row in enumerate(cs_title):
    rel_tags = row.argsort()[:-20:-1]
    ar = []
    words = []       
    for idx in testQTitle[row_idx,:].nonzero()[1]:
        words.append(invDictWords[idx])
    for idx in rel_tags:
        if invDictKeys[idx] in words:
            ar.append(invDictKeys[idx])
    for idx in rel_tags[0:2]:
        if invDictKeys[idx] not in ar:
            ar.append(invDictKeys[idx])
    result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
    countRows += 1 
""" save the resulting dict in file """
result_tagsSeries = pd.Series(result_tags)
result_tagsSeries.to_csv("resultTags_title_v3.csv")
save 3 most likely tags for each post:

In [ ]:
frange = ["0-200000","200000-400000","400000-600000","600000-800000","800000-1000000","1000000-1200000","1200000-1400000",
          "1400000-1600000","1600000-1800000","1800000-2000000","2000000-2013337"]
invDictKeys = getInvDict("invdictKeys.csv")
result_tags = {}
countRows = 0
""" choose a number for chunk_size that is a divisor of the number of rows in testWordsQ; 
    otherwise the iterator will skip the last entries!"""
chunk_size = 500
tStart = time.time()
for fran in frange:
    fname = "invdictIdTest_" + fran + ".csv"
    """ make sure that invDictIdTest is sorted!!! use getInvDict or sort dict after using getDict """
    invDictIdTest = getInvDict(fname)
    fname = "testWordsQTitle_" + fran + ".mtx"
    testQTitle = mmread(fname)
    tf_idf_matTestTitle = tfidf.fit_transform(testQTitle)
    fname = "testWordsQBody_" + fran + ".mtx"
    testQBody = mmread(fname)
    testQBody = testQBody.tocsr()
    tf_idf_matTestBody = tfidf.fit_transform(testQBody)
    del testQTitle
    del testQBody
    gc.collect()
    for idx_chunk in xrange(len(invDictIdTest)/chunk_size):
        cs_title = linear_kernel(tf_idf_matTestTitle[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_title)
        # cs_body is computed here but is not used in the tag selection below
        cs_body = linear_kernel(tf_idf_matTestBody[chunk_size*idx_chunk:chunk_size*(idx_chunk+1),:], tf_idf_body)
        for row in cs_title:
            # indices of the 3 highest-scoring tags, in descending order of similarity
            rel_tags = row.argsort()[:-4:-1]
            ar = []
            for idx in rel_tags:
                ar.append(invDictKeys[idx])
            result_tags[invDictIdTest[countRows%200000]] = " ".join(ar)
            countRows += 1     
        del cs_title
        del cs_body
        gc.collect()
        if countRows % 20000 == 0:
            print("{0:d} questions finished in {1:.0f}s".format(countRows, time.time()-tStart))
            tStart = time.time()
""" the linear_kernel below is necessary because matrix size 13337 is not divisible by chunk_size;
    it will skip the last part in the for loop above and i have to do it manually below """
cs_title = linear_kernel(tf_idf_matTestTitle[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_title)
cs_body = linear_kernel(tf_idf_matTestBody[len(invDictIdTest) - len(invDictIdTest)%chunk_size:,:], tf_idf_body)
for lst in cs_title:
    rel_tags = lst.argsort()[:-4:-1]
    for key in rel_tags:
        l.append(invDictKeys[key])
    result_tags[invDictIdTest[countRows%200000]] = " ".join(l)
    countRows += 1 
""" save the resulting dict in file """
result_tagsSeries = pd.Series(result_tags)
result_tagsSeries.to_csv("resultTags_T+B_v2.csv")

In [37]:
result_tagsSeries_1 = getRealDict("resultTags_T+B_v2.csv")
#result_tagsSeries_1 = pd.Series(result_tagsSeries_1)

In [38]:
result_tagsSeries_2 = getRealDict("resultTags_T+B_v2_2.csv")
#result_tagsSeries_2 = pd.Series(result_tagsSeries_2)

In [39]:
result_tagsSeries_full = dict(list(result_tagsSeries_1.items()) + list(result_tagsSeries_2.items()))

In [40]:
result_tagsSeries_full1 = pd.Series(result_tagsSeries_full)

In [43]:
result_tagsSeries_full1.to_csv("resultTags_T+B_v2_full.csv")
replace the predictions made by cosine similarity for duplicate questions with the tags from the train set:

In [9]:
duplicates = pd.read_csv("duplicates_single_v2.csv", index_col='Unnamed: 0')

In [10]:
dup = pd.Series(duplicates['Tags'].values, index=duplicates['Id_x'])
result = getRealDict("resultTags_title_v3.csv")

In [11]:
for key, value in dup.iteritems():
    result[key] = value

In [12]:
save_results(result, "submission_tfidf_titleonly_v3.csv")